1. Preparing R

Load packages

library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library("skimr")
## Warning: package 'skimr' was built under R version 3.5.2
library("here")
## here() starts at /Users/sjaganna/Desktop/CU-onedrive/OneDrive - The University of Colorado Denver/08-teaching/molb7910

This should install readr, tidyr, dplyr, purr, stringr, ggplot2, tibble, and forcats.

Set working directory

setwd(here())

2. Importing data

In base R, we use read.table, read.csv, read.delim, etc. to import data. The problem with these functions is that they often coerce characters and other data types into factors. The tidyverse package readr guesses the data type of each column and converts types when appropriate (but does NOT convert strings to factors automatically).

We will be using readr exclusively to import data in this class. The typical commands used to input data using readr are below:

Using some of the following arguments makes it easier to control how we want to input the data.

Note: All of these functions can also be used in an interactive manner via Environment > Import Dataset > From Text (readr)

A. RNA read counts

count_summary_rna <- read_delim(here("1-class1", 
                                      "data", 
                                     "count_summary_rna.tsv"), 
                                delim = "\t", 
                                escape_double = FALSE, 
                                trim_ws = TRUE, 
                                skip = 1)
## Parsed with column specification:
## cols(
##   Geneid = col_character(),
##   Chr = col_character(),
##   Start = col_character(),
##   End = col_character(),
##   Strand = col_character(),
##   Length = col_double(),
##   `0h_1_RNA_S13_umi.bam` = col_double(),
##   `0h_2_RNA_S14_umi.bam` = col_double(),
##   `0h_3_RNA_S15_umi.bam` = col_double(),
##   `14h_1_RNA_S22_umi.bam` = col_double(),
##   `14h_2_RNA_S23_umi.bam` = col_double(),
##   `14h_3_RNA_S24_umi.bam` = col_double(),
##   `4h_1_RNA_S16_umi.bam` = col_double(),
##   `4h_2_RNA_S17_umi.bam` = col_double(),
##   `4h_3_RNA_S18_umi.bam` = col_double(),
##   `8h_1_RNA_S19_umi.bam` = col_double(),
##   `8h_2_RNA_S20_umi.bam` = col_double(),
##   `8h_3_RNA_S21_umi.bam` = col_double()
## )

B. Experimental metadata

experiment_metadata <- read_delim(here("1-class1", 
                                       "data", 
                                       "experiment-metadata.txt"),
                                  "\t", 
                                  escape_double = FALSE, 
                                  trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   `sample-name` = col_character(),
##   hour = col_double(),
##   treatment = col_character(),
##   experiment = col_character()
## )

C. ENSEMBL to Gene Symbol mapping file

ensembl_to_genesymbol <- read_csv(here("1-class1", 
                                       "data", 
                                       "ensembl_to_genesymbol.csv"))
## Parsed with column specification:
## cols(
##   `Ensembl Gene ID` = col_character(),
##   `Ensembl Transcript ID` = col_character(),
##   `Transcript length (including UTRs and CDS)` = col_double(),
##   `CDS Length` = col_double(),
##   `Associated Gene Name` = col_character()
## )

3. Getting familiar with the data

Some of the useful Data Frame Functions are as follows:
- head() - shows first 6 rows
- tail() - shows last 6 rows
- dim() - returns the dimensions of data frame (i.e. number of rows and number of columns)
- nrow() - number of rows
- ncol() - number of columns
- names() or colnames() - both show the names attribute for a data frame
- sapply(dataframe, class) - shows the class of each column in the data frame
- str() - structure of data frame - name, type and preview of data in each column - glimpse()

count_summary_rna #this works well with tbl_df data type. Others could take a LONG time  
## # A tibble: 20,025 x 18
##    Geneid Chr   Start End   Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
##    <chr>  <chr> <chr> <chr> <chr>   <dbl>            <dbl>            <dbl>
##  1 ENST0… chr1… 9446… 9448… -;-;-…   2265            243              322  
##  2 ENST0… chr1… 9614… 9615… +;+;+…   1686             19               17  
##  3 ENST0… chr1  1014… 1014… +         473             45               53  
##  4 ENST0… chr1… 1041… 1041… +;+;+…   5060             42               50  
##  5 ENST0… chr1… 1082… 1082… -;-;-…   1148             17               19  
##  6 ENST0… chr1  1217… 1217… -         168             27.5             33.7
##  7 ENST0… chr1… 1217… 1217… -;-;-…   1070            158              170. 
##  8 ENST0… chr1… 1217… 1217… -;-;-…   1028            148.             162. 
##  9 ENST0… chr1  1232… 1233… +         866             11               21  
## 10 ENST0… chr1… 1255… 1255… -;-;-…    625             27.3             23.8
## # ... with 20,015 more rows, and 10 more variables:
## #   `0h_3_RNA_S15_umi.bam` <dbl>, `14h_1_RNA_S22_umi.bam` <dbl>,
## #   `14h_2_RNA_S23_umi.bam` <dbl>, `14h_3_RNA_S24_umi.bam` <dbl>,
## #   `4h_1_RNA_S16_umi.bam` <dbl>, `4h_2_RNA_S17_umi.bam` <dbl>,
## #   `4h_3_RNA_S18_umi.bam` <dbl>, `8h_1_RNA_S19_umi.bam` <dbl>,
## #   `8h_2_RNA_S20_umi.bam` <dbl>, `8h_3_RNA_S21_umi.bam` <dbl>
# display first six rows of a dataframe
head(count_summary_rna) 
## # A tibble: 6 x 18
##   Geneid Chr   Start End   Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
##   <chr>  <chr> <chr> <chr> <chr>   <dbl>            <dbl>            <dbl>
## 1 ENST0… chr1… 9446… 9448… -;-;-…   2265            243              322  
## 2 ENST0… chr1… 9614… 9615… +;+;+…   1686             19               17  
## 3 ENST0… chr1  1014… 1014… +         473             45               53  
## 4 ENST0… chr1… 1041… 1041… +;+;+…   5060             42               50  
## 5 ENST0… chr1… 1082… 1082… -;-;-…   1148             17               19  
## 6 ENST0… chr1  1217… 1217… -         168             27.5             33.7
## # ... with 10 more variables: `0h_3_RNA_S15_umi.bam` <dbl>,
## #   `14h_1_RNA_S22_umi.bam` <dbl>, `14h_2_RNA_S23_umi.bam` <dbl>,
## #   `14h_3_RNA_S24_umi.bam` <dbl>, `4h_1_RNA_S16_umi.bam` <dbl>,
## #   `4h_2_RNA_S17_umi.bam` <dbl>, `4h_3_RNA_S18_umi.bam` <dbl>,
## #   `8h_1_RNA_S19_umi.bam` <dbl>, `8h_2_RNA_S20_umi.bam` <dbl>,
## #   `8h_3_RNA_S21_umi.bam` <dbl>
# display last six rows of a dataframe
tail(count_summary_rna)  
## # A tibble: 6 x 18
##   Geneid Chr   Start End   Strand Length `0h_1_RNA_S13_u… `0h_2_RNA_S14_u…
##   <chr>  <chr> <chr> <chr> <chr>   <dbl>            <dbl>            <dbl>
## 1 ENST0… chrX… 1550… 1550… +;+;+…    785             13.7             15.3
## 2 ENST0… chrX… 1550… 1550… +;+;+…    858             13.7             15.3
## 3 ENST0… chrX… 1552… 1552… +;+;+…    598             48               43.5
## 4 ENST0… chrX… 1552… 1552… +;+;+…    486             48               42.5
## 5 ENST0… chrX… 1554… 1554… -;-;-…   1269             15               13.5
## 6 ENST0… chrX… 1554… 1554… -;-;-…   1133             15               13.5
## # ... with 10 more variables: `0h_3_RNA_S15_umi.bam` <dbl>,
## #   `14h_1_RNA_S22_umi.bam` <dbl>, `14h_2_RNA_S23_umi.bam` <dbl>,
## #   `14h_3_RNA_S24_umi.bam` <dbl>, `4h_1_RNA_S16_umi.bam` <dbl>,
## #   `4h_2_RNA_S17_umi.bam` <dbl>, `4h_3_RNA_S18_umi.bam` <dbl>,
## #   `8h_1_RNA_S19_umi.bam` <dbl>, `8h_2_RNA_S20_umi.bam` <dbl>,
## #   `8h_3_RNA_S21_umi.bam` <dbl>
#Compactly display the internal structure of an R object
str(count_summary_rna)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20025 obs. of  18 variables:
##  $ Geneid               : chr  "ENST00000327044.6_51_2298" "ENST00000338591.7_360_2034" "ENST00000379389.4_176_647" "ENST00000379370.6_1158_6186" ...
##  $ Chr                  : chr  "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1" "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1" "chr1" "chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c"| __truncated__ ...
##  $ Start                : chr  "944696;945056;945517;946172;946401;948130;948489;951126;951999;952411;953174;953781;954003;955922;956094;956893"| __truncated__ "961438;961628;961825;962354;962703;963108;963336;963919;964106;964348;964962" "1014005" "1041633;1041955;1043238;1043537;1043822;1044108;1044333;1045160;1045358;1045732;1045963;1046159;1046396;1046819"| __truncated__ ...
##  $ End                  : chr  "944800;945146;945653;946286;946545;948232;948603;951238;952139;952600;953288;953892;954082;956013;956215;957025"| __truncated__ "961552;961750;962047;962471;962917;963253;963504;964008;964180;964530;965190" "1014477" "1041702;1042162;1043457;1043732;1044023;1044257;1044439;1045277;1045523;1045876;1046088;1046265;1046735;1046957"| __truncated__ ...
##  $ Strand               : chr  "-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-" "+;+;+;+;+;+;+;+;+;+;+" "+" "+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+;+" ...
##  $ Length               : num  2265 1686 473 5060 1148 ...
##  $ 0h_1_RNA_S13_umi.bam : num  243 19 45 42 17 ...
##  $ 0h_2_RNA_S14_umi.bam : num  322 17 53 50 19 ...
##  $ 0h_3_RNA_S15_umi.bam : num  303 15 48 52 25 ...
##  $ 14h_1_RNA_S22_umi.bam: num  177 9 11 32 3 ...
##  $ 14h_2_RNA_S23_umi.bam: num  177 5 5 31 0 ...
##  $ 14h_3_RNA_S24_umi.bam: num  239 8 14 30 2 ...
##  $ 4h_1_RNA_S16_umi.bam : num  283 10 37 43 17 ...
##  $ 4h_2_RNA_S17_umi.bam : num  229 12 39 47 13 ...
##  $ 4h_3_RNA_S18_umi.bam : num  251 19 46 47 21 ...
##  $ 8h_1_RNA_S19_umi.bam : num  267 13 38 44 12 ...
##  $ 8h_2_RNA_S20_umi.bam : num  257 18 36 44 15 ...
##  $ 8h_3_RNA_S21_umi.bam : num  231 12 22 50 10 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Geneid = col_character(),
##   ..   Chr = col_character(),
##   ..   Start = col_character(),
##   ..   End = col_character(),
##   ..   Strand = col_character(),
##   ..   Length = col_double(),
##   ..   `0h_1_RNA_S13_umi.bam` = col_double(),
##   ..   `0h_2_RNA_S14_umi.bam` = col_double(),
##   ..   `0h_3_RNA_S15_umi.bam` = col_double(),
##   ..   `14h_1_RNA_S22_umi.bam` = col_double(),
##   ..   `14h_2_RNA_S23_umi.bam` = col_double(),
##   ..   `14h_3_RNA_S24_umi.bam` = col_double(),
##   ..   `4h_1_RNA_S16_umi.bam` = col_double(),
##   ..   `4h_2_RNA_S17_umi.bam` = col_double(),
##   ..   `4h_3_RNA_S18_umi.bam` = col_double(),
##   ..   `8h_1_RNA_S19_umi.bam` = col_double(),
##   ..   `8h_2_RNA_S20_umi.bam` = col_double(),
##   ..   `8h_3_RNA_S21_umi.bam` = col_double()
##   .. )
#Get A Glimpse Of Your Data
glimpse(count_summary_rna)
## Observations: 20,025
## Variables: 18
## $ Geneid                  <chr> "ENST00000327044.6_51_2298", "ENST0000...
## $ Chr                     <chr> "chr1;chr1;chr1;chr1;chr1;chr1;chr1;ch...
## $ Start                   <chr> "944696;945056;945517;946172;946401;94...
## $ End                     <chr> "944800;945146;945653;946286;946545;94...
## $ Strand                  <chr> "-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-...
## $ Length                  <dbl> 2265, 1686, 473, 5060, 1148, 168, 1070...
## $ `0h_1_RNA_S13_umi.bam`  <dbl> 243.00, 19.00, 45.00, 42.00, 17.00, 27...
## $ `0h_2_RNA_S14_umi.bam`  <dbl> 322.00, 17.00, 53.00, 50.00, 19.00, 33...
## $ `0h_3_RNA_S15_umi.bam`  <dbl> 303.00, 15.00, 48.00, 52.00, 25.00, 36...
## $ `14h_1_RNA_S22_umi.bam` <dbl> 177.00, 9.00, 11.00, 32.00, 3.00, 22.5...
## $ `14h_2_RNA_S23_umi.bam` <dbl> 177.00, 5.00, 5.00, 31.00, 0.00, 29.17...
## $ `14h_3_RNA_S24_umi.bam` <dbl> 239.00, 8.00, 14.00, 30.00, 2.00, 27.3...
## $ `4h_1_RNA_S16_umi.bam`  <dbl> 283.00, 10.00, 37.00, 43.00, 17.00, 31...
## $ `4h_2_RNA_S17_umi.bam`  <dbl> 229.00, 12.00, 39.00, 47.00, 13.00, 40...
## $ `4h_3_RNA_S18_umi.bam`  <dbl> 251.00, 19.00, 46.00, 47.00, 21.00, 23...
## $ `8h_1_RNA_S19_umi.bam`  <dbl> 267.00, 13.00, 38.00, 44.00, 12.00, 33...
## $ `8h_2_RNA_S20_umi.bam`  <dbl> 257.00, 18.00, 36.00, 44.00, 15.00, 36...
## $ `8h_3_RNA_S21_umi.bam`  <dbl> 231.00, 12.00, 22.00, 50.00, 10.00, 29...
glimpse(experiment_metadata)
## Observations: 12
## Variables: 4
## $ `sample-name` <chr> "0h_1_RNA_S13_umi.bam", "0h_2_RNA_S14_umi.bam", ...
## $ hour          <dbl> 0, 0, 0, 4, 4, 4, 8, 8, 8, 14, 14, 14
## $ treatment     <chr> "doxcycyline", "doxcycyline", "doxcycyline", "do...
## $ experiment    <chr> "rnaseq", "rnaseq", "rnaseq", "rnaseq", "rnaseq"...

Summary

A generic function used to produce result summaries of the results of various model fitting functions.

summary(count_summary_rna)
##     Geneid              Chr               Start          
##  Length:20025       Length:20025       Length:20025      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      End               Strand              Length      
##  Length:20025       Length:20025       Min.   :     9  
##  Class :character   Class :character   1st Qu.:   660  
##  Mode  :character   Mode  :character   Median :  1235  
##                                        Mean   :  1635  
##                                        3rd Qu.:  2053  
##                                        Max.   :108334  
##  0h_1_RNA_S13_umi.bam 0h_2_RNA_S14_umi.bam 0h_3_RNA_S15_umi.bam
##  Min.   :    0.0      Min.   :    0.00     Min.   :    0.0     
##  1st Qu.:   13.0      1st Qu.:   14.67     1st Qu.:   14.0     
##  Median :   38.0      Median :   42.00     Median :   39.5     
##  Mean   :  120.8      Mean   :  133.27     Mean   :  125.1     
##  3rd Qu.:  100.8      3rd Qu.:  111.00     3rd Qu.:  103.5     
##  Max.   :24374.0      Max.   :29852.00     Max.   :27929.0     
##  14h_1_RNA_S22_umi.bam 14h_2_RNA_S23_umi.bam 14h_3_RNA_S24_umi.bam
##  Min.   :    0.00      Min.   :    0.00      Min.   :    0.00     
##  1st Qu.:    7.00      1st Qu.:    6.00      1st Qu.:    7.33     
##  Median :   27.00      Median :   23.00      Median :   27.57     
##  Mean   :  119.78      Mean   :   99.32      Mean   :  116.78     
##  3rd Qu.:   91.67      3rd Qu.:   75.00      3rd Qu.:   89.00     
##  Max.   :26483.50      Max.   :23692.50      Max.   :24847.50     
##  4h_1_RNA_S16_umi.bam 4h_2_RNA_S17_umi.bam 4h_3_RNA_S18_umi.bam
##  Min.   :    0.0      Min.   :    0.00     Min.   :    0.0     
##  1st Qu.:   12.5      1st Qu.:   12.17     1st Qu.:   11.5     
##  Median :   37.5      Median :   36.32     Median :   34.0     
##  Mean   :  121.9      Mean   :  117.99     Mean   :  109.9     
##  3rd Qu.:  100.8      3rd Qu.:   97.67     3rd Qu.:   92.0     
##  Max.   :28571.0      Max.   :27655.00     Max.   :25130.0     
##  8h_1_RNA_S19_umi.bam 8h_2_RNA_S20_umi.bam 8h_3_RNA_S21_umi.bam
##  Min.   :    0.00     Min.   :    0.0      Min.   :    0.00    
##  1st Qu.:   10.33     1st Qu.:   11.0      1st Qu.:   10.25    
##  Median :   33.33     Median :   35.5      Median :   33.50    
##  Mean   :  120.41     Mean   :  125.1      Mean   :  118.88    
##  3rd Qu.:   95.00     3rd Qu.:  100.5      3rd Qu.:   95.50    
##  Max.   :27141.00     Max.   :27735.0      Max.   :24991.00
summary(count_summary_rna$`0h_1_RNA_S13_umi.bam`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    13.0    38.0   120.8   100.8 24374.0

An alternative to summary is skim.

library(skimr)
skim(count_summary_rna)
## Skim summary statistics
##  n obs: 20025 
##  n variables: 18 
## 
## ── Variable type:character ─────────────────────────────────────────────────────────────────
##  variable missing complete     n min  max empty n_unique
##       Chr       0    20025 20025   4 1809     0     1050
##       End       0    20025 20025   6 3619     0    19249
##    Geneid       0    20025 20025  22   29     0    20025
##     Start       0    20025 20025   6 3619     0    19283
##    Strand       0    20025 20025   1  723     0      165
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────
##               variable missing complete     n    mean      sd p0    p25
##   0h_1_RNA_S13_umi.bam       0    20025 20025  120.81  538.88  0  13   
##   0h_2_RNA_S14_umi.bam       0    20025 20025  133.27  603.79  0  14.67
##   0h_3_RNA_S15_umi.bam       0    20025 20025  125.13  569.79  0  14   
##  14h_1_RNA_S22_umi.bam       0    20025 20025  119.78  478.79  0   7   
##  14h_2_RNA_S23_umi.bam       0    20025 20025   99.32  418.02  0   6   
##  14h_3_RNA_S24_umi.bam       0    20025 20025  116.78  475.03  0   7.33
##   4h_1_RNA_S16_umi.bam       0    20025 20025  121.95  536.99  0  12.5 
##   4h_2_RNA_S17_umi.bam       0    20025 20025  117.99  526     0  12.17
##   4h_3_RNA_S18_umi.bam       0    20025 20025  109.89  482.37  0  11.5 
##   8h_1_RNA_S19_umi.bam       0    20025 20025  120.41  512.3   0  10.33
##   8h_2_RNA_S20_umi.bam       0    20025 20025  125.1   521.28  0  11   
##   8h_3_RNA_S21_umi.bam       0    20025 20025  118.88  496.42  0  10.25
##                 Length       0    20025 20025 1634.98 2204.56  9 660   
##      p50     p75     p100     hist
##    38     100.83  24374   ▇▁▁▁▁▁▁▁
##    42     111     29852   ▇▁▁▁▁▁▁▁
##    39.5   103.5   27929   ▇▁▁▁▁▁▁▁
##    27      91.67  26483.5 ▇▁▁▁▁▁▁▁
##    23      75     23692.5 ▇▁▁▁▁▁▁▁
##    27.57   89     24847.5 ▇▁▁▁▁▁▁▁
##    37.5   100.83  28571   ▇▁▁▁▁▁▁▁
##    36.32   97.67  27655   ▇▁▁▁▁▁▁▁
##    34      92     25130   ▇▁▁▁▁▁▁▁
##    33.33   95     27141   ▇▁▁▁▁▁▁▁
##    35.5   100.5   27735   ▇▁▁▁▁▁▁▁
##    33.5    95.5   24991   ▇▁▁▁▁▁▁▁
##  1235    2053    108334   ▇▁▁▁▁▁▁▁

Table

Uses the cross-classifying factors to build a contingency table of the counts at each combination of factor levels.

table(experiment_metadata$hour)
## 
##  0  4  8 14 
##  3  3  3  3

Plotting

The two simplest plots to use to get a sense of your data are plot(x) and hist(x)

plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`)

plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`, count_summary_rna$`0h_2_RNA_S14_umi.bam`)

plot(count_summary_rna$`0h_1_RNA_S13_umi.bam`, count_summary_rna$`14h_1_RNA_S22_umi.bam`)

plot(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`), log2(count_summary_rna$`14h_1_RNA_S22_umi.bam`))

plot(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`), log2(count_summary_rna$`0h_2_RNA_S14_umi.bam`))

hist(count_summary_rna$`0h_1_RNA_S13_umi.bam`)

hist(log2(count_summary_rna$`0h_1_RNA_S13_umi.bam`))

4. Tidying and transforming data

What is tidy data?

“Tidy datasets are all alike but every messy dataset is messy in its own way” Hadley Wickham

Reshaping data

The four verbs to keep in mind for reshaping data with tidyr are:
- spread
- gather
- separate
- unite

Transforming data

The verbs to keep in mind for transforming data with dplyr are:
- select
- rename - arrange
- filter
- mutate
- group_by - summarise - The pipe: %>%

Select columns
data <- count_summary_rna %>%
                  select(-Chr, -Start, -End, -Strand)
Renaming columns
names(data)
##  [1] "Geneid"                "Length"               
##  [3] "0h_1_RNA_S13_umi.bam"  "0h_2_RNA_S14_umi.bam" 
##  [5] "0h_3_RNA_S15_umi.bam"  "14h_1_RNA_S22_umi.bam"
##  [7] "14h_2_RNA_S23_umi.bam" "14h_3_RNA_S24_umi.bam"
##  [9] "4h_1_RNA_S16_umi.bam"  "4h_2_RNA_S17_umi.bam" 
## [11] "4h_3_RNA_S18_umi.bam"  "8h_1_RNA_S19_umi.bam" 
## [13] "8h_2_RNA_S20_umi.bam"  "8h_3_RNA_S21_umi.bam"
data <- data %>% rename(
                        hour00_rep1 = `0h_1_RNA_S13_umi.bam`,
                        hour00_rep2 = `0h_2_RNA_S14_umi.bam`,
                        hour00_rep3 = `0h_3_RNA_S15_umi.bam`,
                        hour04_rep1 = `4h_1_RNA_S16_umi.bam`,
                        hour04_rep2 = `4h_2_RNA_S17_umi.bam`,
                        hour04_rep3 = `4h_3_RNA_S18_umi.bam`,
                        hour08_rep1 = `8h_1_RNA_S19_umi.bam`,
                        hour08_rep2 = `8h_2_RNA_S20_umi.bam`,
                        hour08_rep3 = `8h_3_RNA_S21_umi.bam`,
                        hour14_rep1 = `14h_1_RNA_S22_umi.bam`,
                        hour14_rep2 = `14h_2_RNA_S23_umi.bam`,
                        hour14_rep3 = `14h_3_RNA_S24_umi.bam`)
Rearrange columns
## Using dplyr
# data <- data %>% select(Geneid, Length, hour00_rep1, hour00_rep2, hour00_rep3, hour04_rep1, hour04_rep2, hour04_rep3, hour08_rep1, hour08_rep2, hour08_rep3, hour14_rep1, hour14_rep2, hour14_rep3)

## Alternative with baseR
data <- data[, c(1:5, 9:14, 6:8)]
Arrange rows by counts
data <- data %>% arrange(hour14_rep1)
data <- data %>% arrange(desc(hour14_rep1))
Filter rows by counts
data_sub <- data %>% filter(hour14_rep1 > 2000)
Mutate an observation.

Example: Log transformation or sum of multiple columns

data_sum <- data %>% mutate(sum_hour00 = sum(hour00_rep1, hour00_rep2, hour00_rep3))
Merging dataframes
  • join to map ensembl IDs to gene_symbols

In order to merge the dataframe, we need to first get the Ensembl id in our dataframe to match the Ensembl id in the mapping file. Let’s take a look at these.

head(data$Geneid)
## [1] "ENST00000550420.2_116_632"   "ENST00000544301.6_414_1812" 
## [3] "ENST00000550420.2_722_1007"  "ENST00000370651.5_1154_1673"
## [5] "ENST00000357726.4_28_1477"   "ENST00000318203.9_698_1997"
head(ensembl_to_genesymbol$`Ensembl Transcript ID`)
## [1] "ENST00000361390" "ENST00000361453" "ENST00000361624" "ENST00000361739"
## [5] "ENST00000361851" "ENST00000361899"

The Ensembl transcript id in our data table has trailing characters that need to be removed. This is an example of “string manipulation”. stringr is a very powerful package for string manipulation in R that would take an entire class (at least) to go through. Instead, we are going to use tidyr’s separate to accomplish this task.

data <- data %>% separate(Geneid, sep = "\\.", into = c("Geneid", "extra"))

head(data$Geneid)
## [1] "ENST00000550420" "ENST00000544301" "ENST00000550420" "ENST00000370651"
## [5] "ENST00000357726" "ENST00000318203"

It looks like we have the Geneid in the form we need for merging. Now would be a good time to also get the column names to match to enable easy merging.

names(ensembl_to_genesymbol)
## [1] "Ensembl Gene ID"                           
## [2] "Ensembl Transcript ID"                     
## [3] "Transcript length (including UTRs and CDS)"
## [4] "CDS Length"                                
## [5] "Associated Gene Name"
ensembl_to_genesymbol <- ensembl_to_genesymbol %>% rename(
  ensembl_gene = `Ensembl Gene ID`,
  ensembl_transcript = `Ensembl Transcript ID`,
  transcript_length = `Transcript length (including UTRs and CDS)`, 
  cds_length = `CDS Length`, 
  gene_symbol = `Associated Gene Name`)

# Remove unnecessary columns
ensembl_to_genesymbol <- ensembl_to_genesymbol %>% select(ensembl_transcript, gene_symbol)

names(data)
##  [1] "Geneid"      "extra"       "Length"      "hour00_rep1" "hour00_rep2"
##  [6] "hour00_rep3" "hour04_rep1" "hour04_rep2" "hour04_rep3" "hour08_rep1"
## [11] "hour08_rep2" "hour08_rep3" "hour14_rep1" "hour14_rep2" "hour14_rep3"
data <- data %>% rename(
  ensembl_transcript = Geneid)

data_genelevel <- left_join(data, ensembl_to_genesymbol)
## Joining, by = "ensembl_transcript"
Summarise the counts via sum to collapse duplicate transcript ids.
# remove unnecessary columns
data_genelevel <- data_genelevel %>% select (-extra, -Length, -ensembl_transcript)

# rearrange to get gene_symbol first
data_genelevel <- data_genelevel %>% select (gene_symbol, everything())

names(data_genelevel)
##  [1] "gene_symbol" "hour00_rep1" "hour00_rep2" "hour00_rep3" "hour04_rep1"
##  [6] "hour04_rep2" "hour04_rep3" "hour08_rep1" "hour08_rep2" "hour08_rep3"
## [11] "hour14_rep1" "hour14_rep2" "hour14_rep3"
# check number of transcripts vs. number of genes
length(data_genelevel$gene_symbol)
## [1] 20025
length(unique(data_genelevel$gene_symbol))
## [1] 10568
# group_by genes and summarise counts
nrow(data_genelevel)
## [1] 20025
data_genelevel <- data_genelevel %>% 
  group_by(gene_symbol) %>% 
  summarise(hour00_rep1 = sum(hour00_rep1), 
            hour00_rep2 = sum(hour00_rep2),
            hour00_rep3 = sum(hour00_rep3),
            hour04_rep1 = sum(hour04_rep1),
            hour04_rep2 = sum(hour04_rep2),
            hour04_rep3 = sum(hour04_rep3),
            hour08_rep1 = sum(hour08_rep1),
            hour08_rep2 = sum(hour08_rep2),
            hour08_rep3 = sum(hour08_rep3),
            hour14_rep1 = sum(hour14_rep1),
            hour14_rep2 = sum(hour14_rep2),
            hour14_rep3 = sum(hour14_rep3))

nrow(data_genelevel)
## [1] 10568

Homework

TBD